install.packages('plotly')
install.packages('colorBlindness')
devtools::install_github('jo-m-lab/ARBOL') # ARBOL is used to plot clusters.
devtools::install_github('immunogenomics/presto') # presto is used to speed up Wilcoxon tests for marker gene calculation in Seurat1 - Cellxgene Census
Libraries
Installation
suppressPackageStartupMessages({
library(Seurat)
library(tidyverse)
library(ARBOL)
library(plotly)
library(colorBlindness)
library(RColorBrewer)
library(vegan)
library(cluster)
})Found more than one class "dist" in cache; using the first, from namespace 'spam'
Also defined by 'BiocGenerics'
Warning: package 'glmGamPoi' was built under R version 4.3.2
set.seed(687)Load data
srobj <- readRDS('/Users/kylekimler/Downloads/d8e35450-de43-451a-9979-276eac688bce.rds')
genes <- read_csv('/Users/kylekimler/CDN/workshops/workshop1/data/cov_flu_gene_names_table.csv')Rows: 33234 Columns: 2
── Column specification ────────────────────────────────────────────────────────
Delimiter: ","
chr (2): index, feature_name
ℹ Use `spec()` to retrieve the full column specification for this data.
ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.
## Need to remake seurat object
mtx <- srobj@assays$RNA@data
rownames(mtx) <- genes[match(row.names(mtx),genes$index),]$feature_name
srobj <- CreateSeuratObject(counts = mtx, meta.data = srobj@meta.data)Warning: Feature names cannot have underscores ('_'), replacing with dashes
('-')
srobjAn object of class Seurat
33234 features across 59572 samples within 1 assay
Active assay: RNA (33234 features, 0 variable features)
1 layer present: counts
Generate a color palette for our cell types
# https://www.datanovia.com/en/blog/easy-way-to-expand-color-palettes-in-r/
# nb.cols <- length(unique(se$Celltype))
# mycolors <- colorRampPalette(paletteMartin)(nb.cols)
pal <- paletteMartin
names(pal) <- sort(unique(srobj$Celltype))Seurat pre-processing for cluster visualization (UMAP)
srobj <- srobj %>%
NormalizeData(verbose = FALSE) %>%
FindVariableFeatures(
method = "vst",
nfeatures = 3000,
verbose = FALSE) %>%
ScaleData(verbose = FALSE) %>%
RunPCA(verbose = FALSE) %>%
RunUMAP(dims = 1:30, verbose = FALSE, n.components=3L)Warning: The default method for RunUMAP has changed from calling Python UMAP via reticulate to the R-native UWOT using the cosine metric
To use Python UMAP via reticulate, set umap.method to 'umap-learn' and metric to 'correlation'
This message will be shown once per session
Found more than one class "dist" in cache; using the first, from namespace 'spam'
Also defined by 'BiocGenerics'
Found more than one class "dist" in cache; using the first, from namespace 'spam'
Also defined by 'BiocGenerics'
Viewing clusters
Typically in scRNA analysis, clusters are viewed by coloring cells on a dimension-reduced latent space of gene expression, like a tSNE or a UMAP. In the dataset we’ve downloaded, we have a tSNE already calculated, so let’s display the authors’ celltypes there first.
By plotting many tSNE’s per sample or per group, we can start to understand how clusters behave across samples.
DimPlot(srobj,
reduction='umap',
group.by='Celltype',
cols = pal)DimPlot(srobj,
reduction='umap',
group.by='Celltype',
split.by='Sample ID',
ncol=4,
cols = pal)For example, we can see the sample from flu patient 5 is dominated by erythrocytes, and these erythrocytes aren’t reliably found across samples, whether diseased or normal. But that doesn’t tell us much about the clusters themselves.
And these celltypes are often validated by plotting heatmaps, dotplots, or violin plots of genes that are most important to each cluster, found by 1 v all differential expression.
Idents(srobj) <- srobj@meta.data$Celltype
celltype_markers <- FindAllMarkers(srobj,
only.pos=TRUE,
logfc.threshold=0.25,
min.diff.pct=0.05,
)Calculating cluster B cell, IgG+
Calculating cluster B cell, IgG-
Calculating cluster CD4, EM-like
Calculating cluster CD4, non-EM-like
Calculating cluster CD8, EM-like
Calculating cluster CD8, non-EM-like
Calculating cluster DC
Calculating cluster NK cell
Calculating cluster Platelet
Calculating cluster RBC
Calculating cluster Uncategorized1
Calculating cluster Uncategorized2
Calculating cluster classical Monocyte
Calculating cluster intermediate Monocyte
Calculating cluster nonclassical Monocyte
top_cluster_features <- celltype_markers %>% group_by(cluster) %>%
filter(p_val_adj==0) %>%
slice_max(avg_log2FC,n=10)
DoHeatmap(srobj, features=top_cluster_features$gene)Warning in DoHeatmap(srobj, features = top_cluster_features$gene): The
following features were omitted as they were not found in the scale.data slot
for the RNA assay: WLS, CLEC2L, RP11-501J20.5, KIR3DL1, RNF165, BNC2, LGALS9B,
KIF19, MSC, RP11-23P13.6, CHRM3-AS2, FAM153A, LINC00402, TCEA3, NOG, CCR4,
WNT7A, PARP11-AS1, PI16, ABCB4, SCN3A, TCL1B, COL4A4, PLEKHG7
And there are quite a few methods for displaying genes across clusters detailed in Seurat vignettes
Beyond the norm
As we can see, the cell types are somewhat well defined but as usual there are some fuzzy boundaries. Some people like to go 3d to see if these boundaries are resolved.
emb <- Seurat::Embeddings(srobj,reduction='umap')
emb <- emb %>% as.data.frame %>% rownames_to_column('CellID') %>%
left_join(srobj@meta.data %>% rownames_to_column("CellID"))Joining with `by = join_by(CellID)`
suppressMessages({
p <- plot_ly(emb, type='scatter3d',
color = ~Celltype,
x = ~umap_1,
y = ~umap_2,
z = ~umap_3,
colors = pal)
p
})No scatter3d mode specifed:
Setting the mode to markers
Read more about this attribute -> https://plotly.com/r/reference/#scatter-mode
But even with 3d we don’t have a way to quantify how useful our clusters are as labels in our dataset. Firstly, with a UMAP or tSNE, we can’t see how well represented samples are across clusters. Some people build cluster-composition bar graphs for this, but these can be difficult to compare, and they don’t show us how large the clusters are. One solution is to make a plot of cluster metrics per cluster. Let’s start with cell counts and sample diversity.
Simpson’s diversity
diversityPerGroup <- function(df, species, group, diversity_metric = 'simpson') {
# Convert strings to symbols for curly-curly operator
species_sym <- rlang::sym(species)
group_sym <- rlang::sym(group)
# Count groups per species directly using curly-curly
tierNcount <- df %>%
group_by({{species_sym}}) %>%
count({{group_sym}}, name = "n") %>% ungroup
# Pivot table to allow vegan::diversity call
tierNwide <- tierNcount %>%
pivot_wider(names_from = {{group_sym}}, values_from = n, values_fill = list(n = 0))
# Use rownames of species for the diversity function, which needs a dataframe
tierNwide_df <- as.data.frame(tierNwide)
# species column must be first
tierNwide_df <- tierNwide_df %>% select({{species}}, everything())
rownames(tierNwide_df) <- tierNwide_df[, 1]
tierNwide_df <- tierNwide_df[, -1]
# Calculate diversity
diversity_values <- vegan::diversity(tierNwide_df, index = diversity_metric)
# Prepare result as a dataframe
result <- data.frame(
species = rownames(tierNwide_df),
diversity = diversity_values,
row.names = NULL
)
# Rename diversity column
names(result)[1] <- species
names(result)[2] <- sprintf('%s_diversity', group)
return(result)
}
# Calculate simpson's diversity per cluster
clusterMetrics <- diversityPerGroup(srobj@meta.data,
species = 'Celltype',
group = 'Sample ID',
diversity_metric = 'simpson')
# Calculate number of cells per cluster and join to metrics table
clusterMetrics <- clusterMetrics %>% left_join(srobj@meta.data %>% count(Celltype))Joining with `by = join_by(Celltype)`
# clusterMetrics
# p1 <- ggplot(clusterMetrics, aes(x = Celltype, y = n)) +
# geom_bar(stat = "identity", fill = 'black') +
# scale_y_log10() +
# labs(x = "Cell Type", y = "Cell Number (log scale)") +
# theme_minimal() +
# theme(axis.text.x = element_text(angle = 45, hjust = 1))
p2 <- ggplot(clusterMetrics, aes(y=Celltype, fill=`Sample ID_diversity`, x = 1, label = n)) +
geom_tile(colour = "white") +
geom_text(nudge_x = 1, size = 6) +
geom_text(aes(label = signif(`Sample ID_diversity`, 3)),size = 6) +
scale_fill_distiller(palette = "Spectral", limits = c(0,1)) + theme_classic() +
coord_fixed(ratio = 1) +
expand_limits(x = c(0.5,3)) +
labs(x = "") +
theme(axis.text.y = element_text(hjust = 1, vjust = 1, size = 20),
axis.text.x = element_blank(),
axis.ticks.x = element_blank(),
axis.title.y = element_text(size = 20),
legend.key.size = unit(1, 'cm'),
legend.title = element_text(size=10),
legend.text = element_text(size=10)
)
p2So in this paper it looks like our clusters are in fact well represented among every sample. None of them are particularly small, but the classical Monocyte cluster contains > 15000 cells. There may be some additional clusters hiding there.
In addition to cell numbers and sample representation, it can be useful to get an idea of how tightly clustered cells are in each cluster, and how distant each cluster is to the others. This way we can quantify how fuzzy the borders are between clusters. Many clustering metrics exist to answer this question (Lim and Qiu 2024) - one popular metric is the average silhouette distance.
Celltypes <- srobj@meta.data$Celltype
pca_embeddings <- Embeddings(srobj, reduction = 'pca')
# Calculate silhouette widths
# sil_widths <- silhouette(x = cluster_assignments, dist = dist(pca_embeddings))
sil_scores <- silhouette(x = as.numeric(Celltypes), dist = dist(pca_embeddings))
# Extract silhouette scores
silhouette_data <- as.data.frame(sil_scores)
# Recover cell type names
silhouette_data$Celltype <- as.character(Celltypes)
silhouette_arranged <- silhouette_data %>% group_by(Celltype) %>% arrange(-sil_width)
silhouette_averages <- silhouette_arranged %>% summarize(avg = mean(sil_width))
avg_silhouettes_plot <- ggplot(silhouette_averages, aes(y = Celltype, x = avg, fill = Celltype, group = Celltype)) +
geom_bar(stat = "identity", position = 'dodge2') +
theme_minimal() +
labs(title = "Average Silhouettes",
y = "Cluster",
x = "Average Silhouette width",
fill = "Cluster") +
theme(axis.text.y = element_text(hjust = 1, vjust = 0.5, size = 20),
axis.text.x = element_blank(),
axis.ticks.x = element_blank(),
axis.title.y = element_text(size = 20),
legend.position = "None") +
scale_fill_manual(values = pal)
avg_silhouettes_plotoverall_average <- silhouette_arranged %>% ungroup %>% summarize(ave = as.numeric(mean(sil_width))) %>% pull(ave)
full_silhouettes_plot <- ggplot(silhouette_arranged, aes(y = Celltype, x = sil_width, fill = Celltype, group = Celltype)) +
geom_bar(stat = "identity", position = 'dodge2') +
geom_vline(xintercept = overall_average,
color = 'red',
linetype = 'longdash') +
theme_minimal() +
labs(title = "Silhouette Analysis",
y = "Cluster",
x = "Silhouette width",
fill = "Cluster") +
theme(axis.text.y = element_text(hjust = 1, vjust = 0.5, size = 20),
axis.text.x = element_blank(),
axis.ticks.x = element_blank(),
axis.title.y = element_text(size = 20),
legend.position = "None") +
scale_fill_manual(values = pal)
full_silhouettes_plotWith ARBOL, we can build a dendrogram of cluster centroid distances based on gene expression or some dimensionality reduction in the data.
# ARBOL requires 3 metadata entries
srobj@meta.data$tierNident <- srobj@meta.data$Celltype
srobj@meta.data$CellID <- row.names(srobj@meta.data)
srobj@meta.data$sample <- srobj@meta.data$`Sample ID`
srobj <- ARBOLcentroidTaxonomy(srobj,
tree_reduction='pca'
)Bootstrap (r = 0.48)... Done.
Bootstrap (r = 0.6)... Done.
Bootstrap (r = 0.68)... Done.
Bootstrap (r = 0.8)... Done.
Bootstrap (r = 0.88)... Done.
Bootstrap (r = 1.0)... Done.
Bootstrap (r = 1.08)... Done.
Bootstrap (r = 1.2)... Done.
Bootstrap (r = 1.28)... Done.
Bootstrap (r = 1.4)... Done.
ggraph(srobj@misc$tax_ggraph, layout = 'dendrogram', height=plotHeight) +
geom_edge_elbow() +
geom_node_text(aes(filter = leaf, label = name, color = name), nudge_y=4,vjust=0.5,hjust=0,size=5) +
scale_color_manual(values = pal) +
geom_node_text(aes(filter = leaf, label = n),color='grey30',nudge_y=1,vjust=0.5,hjust=0,size=4) +
theme_void() +
new_scale('color') +
geom_node_point(aes(filter = leaf, color=sample_diversity),size=4,shape='square') +
scale_color_gradient(low='grey90',high='grey10',limits=c(0,1)) +
coord_flip() + scale_y_reverse() +
expand_limits(y=-20)